import findspark
findspark.init()
findspark.find()
'C:\\Users\\hag8665\\Anaconda3\\envs\\new_environment\\lib\\site-packages\\pyspark'
# !pip install seaborn
# !pip install sklearn
# !pip install folio
# !pip install folium
import pandas as pd
import numpy as np
import seaborn as sns
import pyspark
from pyspark import SparkContext, SparkConf,StorageLevel,SQLContext
from platform import python_version
print(python_version())
3.6.13
from collections import namedtuple
from pyspark.sql import SparkSession
# sc = SparkContext.getOrCreate()
# sqlContext = SQLContext(sc)
spark = SparkSession.builder.appName("test").getOrCreate()
C:\Users\hag8665\Anaconda3\envs\new_environment\lib\site-packages\pyspark\context.py:238: FutureWarning: Python 3.6 support is deprecated in Spark 3.2. FutureWarning
filePath = r"C:\Users\hag8665\Desktop\MSDS 436\Assignment3\airbnb.parquet"
airbnbDF = spark.read.parquet(filePath)
airbnbDF.columns
['host_is_superhost', 'cancellation_policy', 'instant_bookable', 'host_total_listings_count', 'neighbourhood_cleansed', 'latitude', 'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'bed_type', 'minimum_nights', 'number_of_reviews', 'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication', 'review_scores_location', 'review_scores_value', 'price', 'bedrooms_na', 'bathrooms_na', 'beds_na', 'review_scores_rating_na', 'review_scores_accuracy_na', 'review_scores_cleanliness_na', 'review_scores_checkin_na', 'review_scores_communication_na', 'review_scores_location_na', 'review_scores_value_na']
airbnbDF.select("neighbourhood_cleansed", "room_type", "bedrooms", "bathrooms",
"number_of_reviews", "price").show(5)
+----------------------+---------------+--------+---------+-----------------+-----+ |neighbourhood_cleansed| room_type|bedrooms|bathrooms|number_of_reviews|price| +----------------------+---------------+--------+---------+-----------------+-----+ | Western Addition|Entire home/apt| 1.0| 1.0| 180.0|170.0| | Bernal Heights|Entire home/apt| 2.0| 1.0| 111.0|235.0| | Haight Ashbury| Private room| 1.0| 4.0| 17.0| 65.0| | Haight Ashbury| Private room| 1.0| 4.0| 8.0| 65.0| | Western Addition|Entire home/apt| 2.0| 1.5| 27.0|785.0| +----------------------+---------------+--------+---------+-----------------+-----+ only showing top 5 rows
type(airbnbDF)
pyspark.sql.dataframe.DataFrame
# get data info
airbnbDF.printSchema()
root |-- host_is_superhost: string (nullable = true) |-- cancellation_policy: string (nullable = true) |-- instant_bookable: string (nullable = true) |-- host_total_listings_count: double (nullable = true) |-- neighbourhood_cleansed: string (nullable = true) |-- latitude: double (nullable = true) |-- longitude: double (nullable = true) |-- property_type: string (nullable = true) |-- room_type: string (nullable = true) |-- accommodates: double (nullable = true) |-- bathrooms: double (nullable = true) |-- bedrooms: double (nullable = true) |-- beds: double (nullable = true) |-- bed_type: string (nullable = true) |-- minimum_nights: double (nullable = true) |-- number_of_reviews: double (nullable = true) |-- review_scores_rating: double (nullable = true) |-- review_scores_accuracy: double (nullable = true) |-- review_scores_cleanliness: double (nullable = true) |-- review_scores_checkin: double (nullable = true) |-- review_scores_communication: double (nullable = true) |-- review_scores_location: double (nullable = true) |-- review_scores_value: double (nullable = true) |-- price: double (nullable = true) |-- bedrooms_na: double (nullable = true) |-- bathrooms_na: double (nullable = true) |-- beds_na: double (nullable = true) |-- review_scores_rating_na: double (nullable = true) |-- review_scores_accuracy_na: double (nullable = true) |-- review_scores_cleanliness_na: double (nullable = true) |-- review_scores_checkin_na: double (nullable = true) |-- review_scores_communication_na: double (nullable = true) |-- review_scores_location_na: double (nullable = true) |-- review_scores_value_na: double (nullable = true)
get_numericcol = airbnbDF.dtypes
#get all columns
col=[]
for i in get_numericcol:
col.append(i[0])
col[:-10]
['host_is_superhost', 'cancellation_policy', 'instant_bookable', 'host_total_listings_count', 'neighbourhood_cleansed', 'latitude', 'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'bed_type', 'minimum_nights', 'number_of_reviews', 'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication', 'review_scores_location', 'review_scores_value', 'price']
# drop columns with na
airbnbDF = airbnbDF.select(col[:-10])
get_numericcol = airbnbDF.dtypes
numcol=[]
for i in get_numericcol:
if i[1] != 'string':
numcol.append(i[0])
# summary of key numeirca columns
airbnbDF.select("neighbourhood_cleansed", "room_type", "bedrooms", "bathrooms",
"number_of_reviews", "price").describe().show()
+-------+----------------------+---------------+------------------+------------------+-----------------+------------------+ |summary|neighbourhood_cleansed| room_type| bedrooms| bathrooms|number_of_reviews| price| +-------+----------------------+---------------+------------------+------------------+-----------------+------------------+ | count| 7146| 7146| 7146| 7146| 7146| 7146| | mean| null| null|1.3427092079485026|1.3280856423173804|43.55387629443045|213.30982367758187| | stddev| null| null|0.9328545736962833|0.7937873897311164|72.53848066174737| 311.3754991591505| | min| Bayview|Entire home/apt| 0.0| 0.0| 0.0| 10.0| | max| Western Addition| Shared room| 14.0| 14.0| 677.0| 10000.0| +-------+----------------------+---------------+------------------+------------------+-----------------+------------------+
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
# create column list of numeric columns
columnList = [item[0] for item in airbnbDF.dtypes if item[1].startswith('double')]
pdf = airbnbDF.toPandas()
# scatter_matrix(pdf)
#airbnbDF.plot(kind='box', subplot= 'True')
from folio import Folio
import folium
geolocdf= airbnbDF.select("latitude", "longitude")
cordinate = geolocdf.rdd.map(lambda x: x).collect()
basemap = folium.Map(location=[37.77, -122.43], zoom_start=11, prefer_canvas=True)
#fg = folium.FeatureGroup(name='airbnb')
for i in cordinate:
folium.CircleMarker(location=[i[0],i[1]],
radius=0.5,
weight=1).add_to(basemap)
basemap.save('map.html')
del(basemap)
#fg.add_child(folium.Marker(location=[i[0],i[1]]))
# finalmap = basemap.add_child(fg)
# finalmap.save( "map.html" )
# del(finalmap)
%%HTML
<iframe width='100%' height='350' src = "map.html"></iframe>
from pyspark.mllib.stat import Statistics
# result can be used w/ seaborn's heatmap
def compute_correlation_matrix(df, method='pearson'):
# wrapper around
# https://forums.databricks.com/questions/3092/how-to-calculate-correlation-matrix-with-all-colum.html
df_rdd = df.rdd.map(lambda row: row[0:])
corr_mat = Statistics.corr(df_rdd, method=method)
corr_mat_df = pd.DataFrame(corr_mat,
columns=df.columns,
index=df.columns)
return corr_mat_df
corr = compute_correlation_matrix(airbnbDF.select(numcol))
# Generate a mask for the upper triangle
mask = np.triu(np.ones_like(corr, dtype=bool))
# Generate a custom diverging colormap
cmap = sns.diverging_palette(230, 20, as_cmap=True)
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))
sns.heatmap(corr, cmap=cmap, vmax=0.2, center=0,
square=True, linewidths=.5, cbar_kws={"shrink": 1}).set(title='Correlation Plot numeric columns')
[Text(0.5, 1.0, 'Correlation Plot numeric columns')]
# ! pip install plotly
# !pip install chart-studio
import chart_studio.plotly as py
import plotly.graph_objs as go
from plotly.offline import iplot
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode()
import plotly.io as pio
pio.renderers.default='notebook'
import plotly.express as px
# price distribution plot
px.histogram(airbnbDF.toPandas()['price'])
px.box(airbnbDF.select("price","neighbourhood_cleansed").toPandas(),
x= "neighbourhood_cleansed", y ="price")
from pyspark.sql.functions import sum,avg,max,min,mean,count
airbnbDF.groupBy("bedrooms").agg(mean("price").alias("mean_price"),max("price").alias("Max_price"),min("price").alias("Min_price")).show()
+--------+------------------+---------+---------+ |bedrooms| mean_price|Max_price|Min_price| +--------+------------------+---------+---------+ | 0.0|149.35696517412936| 3050.0| 43.0| | 7.0| 1447.5| 1500.0| 1395.0| | 1.0|148.94070016670636| 8000.0| 10.0| | 4.0| 562.6| 3394.0| 120.0| | 14.0| 69.0| 69.0| 69.0| | 3.0|431.46251993620416| 9000.0| 55.0| | 2.0|283.09815950920245| 10000.0| 50.0| | 6.0|1830.7777777777778| 8000.0| 316.0| | 5.0| 849.76| 2000.0| 260.0| +--------+------------------+---------+---------+
airbnbDF.groupBy("bathrooms").agg(mean("price").alias("mean_price"),max("price").alias("Max_price"),min("price").alias("Min_price")).show()
+---------+------------------+---------+---------+ |bathrooms| mean_price|Max_price|Min_price| +---------+------------------+---------+---------+ | 8.0| 67.4| 85.0| 50.0| | 0.0| 91.51282051282051| 900.0| 32.0| | 3.5| 805.3148148148148| 8000.0| 33.0| | 4.5| 804.5| 2250.0| 260.0| | 2.5|452.80924855491327| 1800.0| 30.0| | 1.0|169.69543245869775| 8000.0| 10.0| | 4.0| 347.96875| 1395.0| 35.0| | 0.5|102.17647058823529| 200.0| 45.0| | 14.0| 69.0| 69.0| 69.0| | 3.0|427.97727272727275| 8000.0| 34.0| | 2.0| 327.6208955223881| 10000.0| 31.0| | 1.5|232.98333333333332| 4500.0| 35.0| | 10.0|136.58333333333334| 555.0| 38.0| | 6.0|62.333333333333336| 100.0| 50.0| | 5.0|194.30769230769232| 2282.0| 45.0| | 5.5| 800.0| 800.0| 800.0| +---------+------------------+---------+---------+
airbnbDF.groupBy("beds").agg(mean("price").alias("mean_price"),max("price").alias("Max_price"),min("price").alias("Min_price")).show()
+----+------------------+---------+---------+ |beds| mean_price|Max_price|Min_price| +----+------------------+---------+---------+ | 8.0| 575.0| 1850.0| 40.0| | 0.0|278.82758620689657| 3394.0| 80.0| | 7.0| 1056.148148148148| 8000.0| 260.0| | 1.0|144.44892197125256| 8000.0| 10.0| | 4.0|413.66992665036673| 9000.0| 28.0| |11.0| 316.0| 316.0| 316.0| |14.0| 69.0| 69.0| 69.0| | 3.0|326.61348005502066| 3000.0| 40.0| | 2.0|223.43259668508287| 3800.0| 29.0| |10.0| 384.8| 800.0| 34.0| | 6.0|441.10869565217394| 1500.0| 145.0| | 5.0| 531.672131147541| 10000.0| 45.0| | 9.0| 549.5| 599.0| 500.0| |12.0| 2250.0| 2250.0| 2250.0| +----+------------------+---------+---------+
airbnbDF.groupBy("number_of_reviews").agg(mean("price").alias("mean_price"),max("price").alias("Max_price"),min("price").alias("Min_price")).show()
+-----------------+------------------+---------+---------+ |number_of_reviews| mean_price|Max_price|Min_price| +-----------------+------------------+---------+---------+ | 299.0| 153.0| 260.0| 89.0| | 305.0| 114.0| 133.0| 95.0| | 184.0|111.83333333333333| 160.0| 79.0| | 170.0| 169.875| 499.0| 79.0| | 576.0| 120.0| 120.0| 120.0| | 147.0| 132.25| 181.0| 79.0| | 160.0|122.54545454545455| 265.0| 55.0| | 169.0|241.85714285714286| 699.0| 110.0| | 608.0| 120.0| 120.0| 120.0| | 8.0| 260.0388349514563| 1450.0| 29.0| | 67.0| 168.9| 269.0| 65.0| | 70.0| 146.0| 400.0| 70.0| | 168.0| 133.0| 200.0| 80.0| | 69.0|124.85714285714286| 275.0| 65.0| | 0.0|226.49310094408133| 10000.0| 19.0| | 206.0| 130.0| 213.0| 55.0| | 390.0| 79.0| 79.0| 79.0| | 7.0|274.47222222222223| 8000.0| 30.0| | 677.0| 79.0| 79.0| 79.0| | 249.0| 160.5| 201.0| 120.0| +-----------------+------------------+---------+---------+ only showing top 20 rows